# Tables 8 in "Evaluating (weighted) dynamic treatment effects by double machine
# learning" by Hugo Bodory, Martin Huber, and Luk� Laff�rs 

rm(list = ls())

packages = c('SuperLearner','e1071','glmnet','ranger','xgboost','ggplot2','dplyr','parallel')
lapply(packages,FUN=function(packages) {do.call("require", list(packages))})

# SuperLearner algorithms
MLfunct=function(y, x, d1=NULL, d2=NULL, MLmethod="lasso",  ybin=0){
  if (is.null(d1)==0 & is.null(d2)==0) { y=y[d1==1 & d2==1]; x=x[d1==1 & d2==1,]}
  if (is.null(d1)==0 & is.null(d2)==1) { y=y[d1==1]; x=x[d1==1,]}
  cl <- makeCluster(16, type = "PSOCK") # can use different types here
  clusterSetRNGStream(cl, iseed = 2343)
  # make SL functions available on the clusters
  foo <- clusterEvalQ(cl, library(SuperLearner))  
  if  (MLmethod=="randomforest"){ 
    if (ybin==1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=binomial(),SL.library = "SL.ranger",verbose=TRUE) 
    if (ybin!=1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=gaussian(),SL.library = "SL.ranger",verbose=TRUE) 
  }
  if  (MLmethod=="lasso"){ 
    if (ybin==1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=binomial(),SL.library = "SL.glmnet",verbose=TRUE) 
    if (ybin!=1) model=snowSuperLearner(cluster=cl,Y=y,X=x,family=gaussian(),SL.library = "SL.glmnet",verbose=TRUE) 
  }
  stopCluster(cl)  
model        
}

# static treatment effect estimation
treatDML=function(y,d,x, s=NULL, dtreat=1, dcontrol=0, trim=0.01, MLmethod="lasso", k=3){
  dtre=1*(d==dtreat); dcon=1*(d==dcontrol)
  scorestreat=hdtreat(y=y,d=dtre, x=x, s=s, trim=trim, MLmethod=MLmethod, k=k)
  scorescontrol=hdtreat(y=y,d=dcon, x=x, s=s, trim=trim, MLmethod=MLmethod,k=k)
  trimmed=1*(scorescontrol[,7]+scorestreat[,7]>0)        #number of trimmed observations
  scorestreat=scorestreat[trimmed==0,]
  scorescontrol=scorescontrol[trimmed==0,]
  tscores=(scorestreat[,1]*scorestreat[,2]*(scorestreat[,3]-scorestreat[,4])/(scorestreat[,5])+scorestreat[,6]*scorestreat[,4])/mean(scorestreat[,6])
  cscores=(scorescontrol[,1]*scorescontrol[,2]*(scorescontrol[,3]-scorescontrol[,4])/(scorescontrol[,5])+scorescontrol[,6]*scorescontrol[,4])/mean(scorescontrol[,6])
  meantreat=mean(tscores)
  meancontrol=mean(cscores)
  effect=meantreat - meancontrol
  se=sqrt(mean((tscores-cscores-effect)^2)/length(tscores))
  pval= 2*pnorm((-1)*abs(effect/se))
  list(effect=effect, se=se, pval=pval, ntrimmed=sum(trimmed), meantreat=meantreat, 
       meancontrol=meancontrol, pstreat=scorestreat[,5], pscontrol=scorescontrol[,5])
}

# scores for static treatment effect estimation
hdtreat=function(y,d,x,s=NULL, trim=0.01, MLmethod="lasso", k=3){
  ybin=1*(length(unique(y))==2 & min(y)==0 & max(y)==1)  # check if binary outcome
  x=data.frame(x)
  stepsize=ceiling((1/k)*length(d))
  set.seed(1); idx= sample(length(d), replace=FALSE)
  score=c();
  # cross-fitting procedure that splits sample in training an testing data
  for (i in 1:k){
    tesample=idx[((i-1)*stepsize+1):(min((i)*stepsize,length(d)))]
    trsample=idx[-tesample]
    if (is.null(s)) {gte=rep(1,length(tesample)); ste=gte} #check if weighted estimation should be performed
    if (is.null(s)==0) {
      g=MLfunct(y=s[trsample], x=x[trsample,], MLmethod=MLmethod,  ybin=1)
      gte=predict(g, x[tesample,], onlySL = TRUE)$pred     #predict weighting function in test data
      ste=s[tesample]
    }
    # nuisance parameters
    ps=MLfunct(y=d[trsample], x=x[trsample,], MLmethod=MLmethod,  ybin=1)
    pste=predict(ps, x[tesample,], onlySL = TRUE)$pred     #predict propensity score in test data
    eydx=MLfunct(y=y[trsample], x=x[trsample,], d1=d[trsample], MLmethod=MLmethod, ybin=ybin)
    eydxte=predict(eydx, x[tesample,], onlySL = TRUE)$pred  #predict conditional outcome in test data
    trimmed=1*((pste)<trim) # observations not satisfying trimming restriction
    score=rbind(score, cbind(gte,d[tesample],y[tesample],eydxte,pste, ste,trimmed))
  }
  score = score[order(idx),]
  score
}

# data
df = read.csv(file='data.csv', header=TRUE)
r = df[,1] # random assignment indicator, not used for table 8
d = df[,2:3] # d1,d2: treatments in 2 periods
y = df[,4] # employment: outcome
x0 = df[,5:913] # x0: covariates

# data frame for effect estimation shown in table 8
table_8 = data.frame(matrix(0, nrow=1, ncol=5))
colnames(table_8) = c('eff','se','p','n','trim')

# index for subset of data: treatment sequences 00 and 11, see notes of tables 8 
# drop observations with missing outcome values
idx_0 = (d[1]==0) & (d[2]==0)  # controls
idx_1 = (d[1]==1) & (d[2]==1)  # pseudo treated
idx = (y!=-1) & ((idx_0)|(idx_1))

# subset of data
y = y[idx]
d = 1*idx_1[idx];
x = x0[idx,]

# effect estimation
results = treatDML(y,d,x,s=NULL,dtreat=1,dcontrol=0,trim=0.01,MLmethod="randomforest",k=3)

# results presented in table 8
table_8[1,1:5] = c(round(results$effect, 3), round(results$se, 2), 
                   round(results$pval, 2), length(y), results$ntrimmed)
print(table_8)
